#This script calculates the cosine similarity scores for the synthetic articles
#It does so using pre-trained embeddings and transformation matrices
library(quanteda)
library(conText)
library(dplyr)
library(text2vec)
library(data.table)
source("utils.R")

# Fix the RNG seed so repeated runs of this script start from the same state.
set.seed(123L)

# Inputs: the synthetic articles and the opposition/support term lists.
data <- read.csv(
  file = "data/synthetic/synthetic_articles_all_gpt-3.5-turbo_cleaned.csv"
)
terms <- read.csv(file = "data/synthetic/oppsup_synthetic_terms.csv")

# Distinct language codes present in the term list; one scoring run is made
# per code further down.
countries <- unique(terms$langcode)

# Lookup table from the language name used in `data$Language` to the language
# code used in the term list and in the embedding file names.
language_codes <- setNames(
  c("en", "fr", "es", "ru", "zh-CN", "ar", "ja", "ko"),
  c(
    "English", "French", "Spanish", "Russian",
    "Mandarin", "Arabic", "Japanese", "Korean"
  )
)

# Create the grouping variable.
# Within each language, articles are numbered in their current row order and
# binned into consecutive batches of 50. The batch number is stored in
# `yearwk`, which is later handed to the scoring step as its grouping variable.
data <- data %>%
  group_by(Language) %>%
  mutate(
    article_index = row_number(),
    yearwk = ceiling(article_index / 50)
  ) %>%
  ungroup()

# Languages without an entry in `language_codes` get an NA langcode and would
# silently drop out of every per-language subset, so report them up front.
unmapped_languages <- setdiff(
  unique(as.character(data$Language)),
  names(language_codes)
)
if (length(unmapped_languages) > 0) {
  warning(
    "No language code defined for: ",
    paste(unmapped_languages, collapse = ", "),
    call. = FALSE
  )
}

# Add the 'langcode' column to the 'data' data frame.
# as.character() keeps the lookup name-based even if `Language` was read as a
# factor (a factor subscript would index by integer level code instead), and
# unname() stops the language names from riding along on the new column.
data <- data %>%
  mutate(langcode = unname(language_codes[as.character(Language)]))

# Embedding dimensionality per language code. The value is spliced into the
# file name of the transformation matrix that is loaded for that language.
embedding_dims <- c(
  ar = 15,
  en = 50,
  fr = 25,
  es = 25,
  ja = 10,
  ko = 15,
  ru = 25,
  "zh-CN" = 15
)

# Compute the cosine-similarity scores for one language.
#
# Loads the pre-trained GloVe vectors and the matching transformation matrix
# for `country_code` from disk, turns that language's articles into a corpus,
# and delegates the scoring to get_similarity_scores() (sourced from utils.R).
#
# Relies on the script-level objects `terms`, `data` and `embedding_dims`.
#
# Args:
#   country_code: a single language code (e.g. "en"). It is matched against
#     `terms$langcode`, `data$langcode` and `names(embedding_dims)`, and is
#     used to build the embedding file paths.
#
# Returns:
#   Whatever get_similarity_scores() returns, with an added `country_code`
#   element holding the language code.
#
# Errors:
#   Stops if no embedding dimensionality is configured for the code, or if
#   either embedding file is missing on disk.
process_cos_sim <- function(country_code) {
  # A factor subscript would index `embedding_dims` by integer level code
  # rather than by name, so normalise to character first.
  country_code <- as.character(country_code)

  # Without this check an unknown code yields an NA dimension, a path that
  # contains "NA", and a misleading "files do not exist" error.
  if (!country_code %in% names(embedding_dims)) {
    stop(
      "No embedding dimensionality configured for ", country_code, ".",
      call. = FALSE
    )
  }
  dimension <- embedding_dims[[country_code]]

  # Construct the file paths dynamically
  embedding_dir <- paste0(
    "data/pretrained_embedding/alcembeddings/gloVe_", country_code
  )
  local_transform_path <- paste0(
    embedding_dir, "/glove_transform_", country_code, "wiki_", dimension, ".rds"
  )
  local_glove_path <- paste0(
    embedding_dir, "/glove_vectors_", country_code, "wiki.txt"
  )

  # Fail fast, before any data is subset or loaded
  if (!(file.exists(local_transform_path) && file.exists(local_glove_path))) {
    stop(
      paste("Embedding files for", country_code, "do not exist."),
      call. = FALSE
    )
  }

  # Subset the terms and data for the language
  term <- terms %>% filter(langcode == country_code)
  data_subset <- data %>% filter(langcode == country_code)

  # Opposition words are passed as the first vector, support words as the
  # second
  first_synth <- term$oppword
  second_synth <- term$supword

  local_transform <- as.matrix(readRDS(local_transform_path))

  # The GloVe text file has no header row: the first column holds the word,
  # the remaining columns hold its vector. Build a numeric matrix with the
  # words as row names.
  glove_data <- fread(local_glove_path, header = FALSE)
  local_glove <- as.matrix(glove_data[, -1, with = FALSE])
  rownames(local_glove) <- glove_data[[1L]]

  article_corpus <- corpus(data_subset, text_field = "Article")

  # Process the cosine similarities.
  # NOTE(review): the meaning of `target`, `window` and `norm` is defined by
  # get_similarity_scores() in utils.R - confirm there.
  cos_simsdf_all <- get_similarity_scores(
    x = article_corpus,
    target = "POLITFIG",
    first_vec = first_synth,
    second_vec = second_synth,
    pre_trained = local_glove,
    transform_matrix = local_transform,
    window = 12L,
    group_var = "yearwk",
    norm = "l2"
  )

  # Tag the results with the language code they belong to
  cos_simsdf_all$country_code <- country_code
  cos_simsdf_all
}

# Loop through countries and apply the process_cos_sim function
results_list <- lapply(countries, process_cos_sim)

# Save the combined results.
# The existence check and the creation must refer to the same directory that
# saveRDS() writes into; otherwise the write fails whenever the checked
# directory exists but the target one does not.
output_dir <- "data/output/cos_sims_synthetic_gpt-3.5-turbo"
if (!dir.exists(output_dir)) {
  dir.create(output_dir, recursive = TRUE)
}
saveRDS(results_list, paste0(output_dir, "/cos_sims_synthetic_all.rds"))

# Additionally save each language's results in its own sub-directory.
# `results_list` is in the same order as `countries`, so index i pairs them.
for (i in seq_along(countries)) {
  dir_name <- paste0(output_dir, "/", countries[i])
  if (!dir.exists(dir_name)) {
    dir.create(dir_name, recursive = TRUE)
  }

  saveRDS(results_list[[i]], paste0(dir_name, "/", "cos_simsdf_all.rds"))
}